The following packages are used (Python standard library plus external plotting/data libraries):
import json
from pathlib import Path
# Standard plotly imports
import plotly.graph_objs as go
# NOTE(review): plotly.plotly moved to the separate chart_studio package in
# plotly 4, so this import fails on modern plotly; `py` is also unused in the
# visible code — confirm the pinned plotly version before removing.
import plotly.plotly as py
from plotly.offline import iplot
# To create dataframes
import pandas as pd
# Cufflinks wrapper on plotly
import cufflinks
# Render cufflinks charts offline/inline (no Chart Studio account needed).
cufflinks.go_offline()
# Set the global theme for cufflinks
cufflinks.set_config_file(world_readable=True, theme='solar', offline=True)
# Resolve the project root as the parent of the current working directory.
# Path.cwd() replaces the IPython-only `!pwd` magic with portable Python
# that yields the same directory.
cur_dir = Path.cwd()
project_dir = cur_dir.resolve().parents[0]
data_path = project_dir / 'data' / 'processed'
# Use a context manager so the file handle is closed deterministically
# (the original `json.load(open(...))` leaked the handle).
with open(data_path / 'training_data.json') as f:
    training_set = json.load(f)
# Collapse each example's word-count vector into one scalar feature so it
# can be shown on a single parallel-coordinates axis.
for example in training_set:
    example['x_counts_sum'] = sum(example['x_counts'])
# Features visualized in the parallel-coordinates plots below.
features = [
    'controversiality',
    'children',
    'x_counts_sum',
    'popularity_score',
]
# Per-feature [min, max] bounds, seeded at zero; each key gets its own
# fresh two-element list.
ranges = {feature: [0, 0] for feature in features}
# Widen each feature's [min, max] bounds to cover every observed value.
for example in training_set:
    for feature in features:
        value = example[feature]
        if value < ranges[feature][0]:
            ranges[feature][0] = value
        if value > ranges[feature][1]:
            ranges[feature][1] = value
from io import StringIO

# Round-trip through JSON text so pandas infers column dtypes from the
# serialized records. Wrap in StringIO: passing a literal JSON string to
# read_json is deprecated since pandas 2.1.
df = pd.read_json(StringIO(json.dumps(training_set)))
# Peek at the lowest-scoring rows (display-only in a notebook cell).
df.sort_values(by=['popularity_score']).head()
# Axes for the parallel-coordinates plot; the text feature (x_counts_sum)
# is deliberately left out of this first figure.
no_text_dimensions = [
    dict(range=[0, 1],
         label='is root',
         values=df['is_root']),
    dict(range=ranges['children'],
         label='children',
         values=df['children']),
    dict(range=ranges['controversiality'],
         label='controversiality',
         values=df['controversiality']),
    dict(range=ranges['popularity_score'],
         label='popularity',
         values=df['popularity_score']),
]
# Color each line by its popularity score on a purple-to-cream scale.
data = [
    go.Parcoords(
        line=dict(color=df['popularity_score'],
                  colorscale=[[0, '#6A1B9A'], [0.6, '#E85285'], [1, '#FFECB3']]),
        dimensions=no_text_dimensions,
    )
]
layout = go.Layout(title='Training data without text features')
training_no_text = go.Figure(data=data, layout=layout)
iplot(training_no_text)
Observations:
# Same parallel-coordinates plot as before, now including the summed
# top-160-word-count feature as an extra axis.
top_160_dimensions = [
    dict(range=[0, 1],
         label='is root',
         values=df['is_root']),
    dict(range=ranges['children'],
         label='children',
         values=df['children']),
    dict(range=ranges['controversiality'],
         label='controversiality',
         values=df['controversiality']),
    dict(range=ranges['x_counts_sum'],
         label='top 160 words sum (basic)',
         values=df['x_counts_sum']),
    dict(range=ranges['popularity_score'],
         label='popularity',
         values=df['popularity_score']),
]
# Lines colored by popularity on the same purple-to-cream scale.
data = [
    go.Parcoords(
        line=dict(color=df['popularity_score'],
                  colorscale=[[0, '#6A1B9A'], [0.6, '#E85285'], [1, '#FFECB3']]),
        dimensions=top_160_dimensions,
    )
]
layout = go.Layout(title='Training data with top 160 words')
training_top_160_basic = go.Figure(data=data, layout=layout)
iplot(training_top_160_basic)
Observations:
High top-160-word-sum doesn't imply high popularity, which motivates more advanced text features.
# Shift popularity scores so the smallest observed value maps to zero,
# making every bar non-negative; plot against the word-count-sum feature.
df2 = df.copy()
score_offset = abs(ranges['popularity_score'][0])
df2['popularity_score'] = df2['popularity_score'] + score_offset
df3 = df2[['popularity_score', 'x_counts_sum']].set_index('x_counts_sum')
# Cufflinks renders this as an interactive bar chart inline.
df3.iplot(
    kind='bar',
    xTitle='x counts sum',
    yTitle='popularity',
    title='Popularity in terms of x counts sum')
Observations:
Similar observation as above for high top-160-word-count sum. There's a sweet spot between 0 and ~150 that hosts the most popular comments.